/*
this do file takes the September 29 2019 version of eWIC implementation created
by Rebecca Dickerson and modified in Nov 2019 and Sep 2023 by CA (add some states and check
and revise work done by Rebecca and Ashley)
*/
clear all
set more off
*make sure egenmore is installed, will use eomd function later
cap install egenmore
local dir "./data/raw"
local log_dir ./documentation/logs
local date: display %tdYY-NN-DD date(c(current_date), "DMY")
di "`date'"
capture log close

log using `log_dir'/clean_ewic_imp`date', replace
*there are "/"s in variable names from excel - but we're not going to use those
* solution is to not use firstrow option, clean first row into friendlier varnames, and then change firstrow to varnames
import excel "`dir'/EWic implementation.xlsx", sheet("Square Data Set") allstring firstrow cellrange(A1:CT3192)
*drop vars won't use
drop H-CN CR-CT
* cleaning varnames
foreach var of varlist _all{
		local newname = lower("`var'")
		di "`newname'"
		rename `var' `newname'
	}
replace date = "" if date == "N/A"
* recode some characters in pilot stage so can destring
replace pilotstage1 = "" if pilotstage1 == "-"
replace pilotstage2 = "" if pilotstage2 == "-"
* don't want missing fips codes or missing dates, don't have enough info to be useful
drop if missing(unique) | missing(date) 
destring unique fips* pilot* confidence, replace
* assume that if missing info on pilot stage it's not a pilot county
foreach var of varlist pilot* {
		recode `var' missing = 0
	}
* the two states we may want to exclude later b/c date information is incomplete are kentucky and nevada
replace confidence = (state != "Kentucky" & state != "Nevada")
*only states that should have confidence zero are nevada and kentucky; exclude these later to see how sensitive results are to these data
rename uniqueid fipscode
* create start and end dates for the range of rollout dates
split datetobe, parse("-")
rename datetobe~1 rollout_start
rename datetobe~2 rollout_end
*deal with some places having 2 digits formatted years rather than 4 digits
foreach var of varlist rollout_*{
		gen yrlen_`var' = strlen(`var') - strrpos(`var', "/")
		replace `var' = subinstr(`var', substr(`var', strrpos(`var', "/"), strlen(`var')), "/" + "20" + substr(`var', strrpos(`var', "/") + 1, strlen(`var')), 1) if yrlen_`var' == 2
		/*
		typo: 2109 entered as year for some counties. assume that this was supposed to
		be 2019 and fix
		*/
		replace `var' = subinstr(`var', "2109", "2019", 1)
		gen `var'_date = date(`var', "MDY")
		format *_date %td
		drop yrlen*
	}
order fipscode date rollout*
*expand data to have one obs per day of fy 2005-fy 2020 (the tdl/tip data window)
*this means 16*365 + 5 (16 years, 365 days, plus 4 leap days)
expand 5845
bys fipscode: gen date = date("01/10/2004", "DMY") + _n - 1
format date %td
replace rollout_end_date = rollout_start_date if missing(rollout_end_date)
*ewicimp set to one if on or after rollout_end_date
gen ewicimp = (inrange(date, rollout_end_date, . ))
*your probability of being turned on during a range is the number of days since the range started divided by the length of the range
replace ewicimp = (date-rollout_start_date + 1)/(1 + rollout_end_date - rollout_start_date) if inrange(date, rollout_start_date, rollout_end_date)
*then downweight the first two months of the rollout
*first need the number of days in the month that rollout started (use end of month day for this)
egen rollout_m1_eomd = eomd(rollout_start_date)
*replace ewicimp with 1/3 of ewicimp in first month to account for approximately 1/3 of participants
*	visiting clinic and getting new method of benefits in first month
gen rewt_ewicimp = ewicimp
replace rewt_ewicimp = rewt_ewicimp*(1/3) if ///
	inrange(date, rollout_start_date, rollout_start_date + day(rollout_m1_eomd) - 1)
* will also need the number of days in the month after rollout started (use lag option in eomd)
egen rollout_m2_eomd = eomd(rollout_start_date), lag(-1)
*replace ewicimp with 2/3 of ewicimp in second month to account for approximately 1/3 of participants
*	visiting clinic and getting new method of benefits in second month, on top of those who got it in 
*	first month
replace rewt_ewicimp = rewt_ewicimp*(2/3) if ///
	inrange(date, rollout_start_date + day(rollout_m1_eomd), rollout_start_date + day(rollout_m1_eomd) + day(rollout_m2_eomd) - 1)
drop rollout_m1_eomd rollout_m2_eomd
order fipscode date *_date ewicimp rewt_ewicimp
*TDL data comes from stores that are open/closed in june of a particular year
*TIP data comes from stores authorized in a fiscal year
*this means it probably makes sense to condense ewic implementation down to the federal fiscal year level (oct_{t-1} - sep_t)
*do this as the share of the fiscal year that the county has ewic
*get data to fiscal year level: do this as share of the year 
gen year = yofd(date) if inrange(month(date), 1, 9)
replace year = yofd(date) + 1 if inrange(month(date), 10, 12)
collapse (mean) ewicmean = ewicimp rewt_ewicmean = rewt_ewicimp (median) ewicmedian = ewicimp rewt_ewicmedian = rewt_ewicimp (min) ewicmin = ewicimp rewt_ewicmin = rewt_ewicimp (max) ewicmax = ewicimp rewt_ewicmax = rewt_ewicimp (first) state rollout_start_date rollout_end_date, by(fipscode year)
order fipscode year *ewicmea* *ewicmedia* *ewicmi* *ewicma*
rename fipscode ct_fips
*event time formatting
*since we have a substantial gap in the time aggregation of our outcome and our treatment (treatment at day level, outcome at year level), we will treat the year in which the zip first ever has some ewic as the year of the event (event_time == 0)
gen ev_year = yofd(rollout_start_date) if inrange(month(rollout_start_date), 1, 9)
replace ev_year = yofd(rollout_start_date) + 1 if inrange(month(rollout_start_date), 10, 12)
gen ev_time = year - ev_year
*create pre/post dummies, with bins at -7/7
gen pre7 = inrange(ev_time, -20, -7)
forval t = -6/-1{
		local s = abs(`t')
		gen pre`s' = (ev_time == `t')
	}
forval t = 0/6{
		gen post`t' = (ev_time == `t')
	}
gen post7 = inrange(ev_time, 7, 20)

save "./data/cleaned/ewic_rollout.dta", replace